*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* Replication Files to: P-hacking in clinical trials and how incentives shape the distribution of results across phases 
* Date: April 2020
* Authors: Jérôme Adda, Christian Decker, and Marco Ottaviani
*
*	- Input: 'linked_phases.dta', 'data_counterfactual_analysis.dta'
* 			 		 
*	- Output: 'data_selection_functions.dta' & Figure 3 & Table 1
*			 
* Topic: Estimate Selection Functions
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

*------------------------*
* Section 0 -- Directory *
*------------------------*

clear all
clear matrix
cap log close
set more off

*-----*
* 0.1 *
*-----*

* Run the dofiles which defines all the globals for the folder's and sub-folders
* paths to both data and analysis.

if c(username) == "chrdec" {													
	do "C:/Users/chrdec/Dropbox/clinical trials/stata/PNAS revision/2_analysis/dofiles/0_globals.do"   			// 0_globals.do path

}

*-----*
* 0.2 * 
*-----*

* Set up the directory where data are stored
macro list
cd "${data}"
set matsize 11000

cap log off
log using "${run}${log}log_4.txt", replace

*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~

*-------------------*
* Table of content: *
*-------------------*
* Sections: 
* 1. Prepare Data
* 2. Estimate and Tabulate 
* 3. Figure 3


*---------------------------*
* Section 1 -- Prepare Data *
*---------------------------*

use "${data}${final}linked_phases.dta", clear
keep if Ph2==1
keep nct_id matched 

merge 1:m nct_id using "${data}${final}data_counterfactual_analysis.dta", keepusing(nct_id INDUSTRY p z z_modifier D0 D1 D2 sqrt_enrollment top10rev p_adjust placebo Ph2 completion_year mesh_code rev_rank rxsales_rank rdspend_rank rep_rank)
keep if _merge==3
drop _merge
keep if Ph2==1


** generate dummies for completioan year and mesh category
tab completion_year
levelsof completion_year if completion_year>2006 & completion_year<2019, local(cys) // omitted category: before 2007
foreach y of local cys {
	gen cy_`y'=0
 	replace cy_`y'=1 if completion_year==`y'
}
replace cy_2018=1 if completion_year>2018 // cy_2018 means 2018 or later



tab mesh_code
gen mesh_aux=subinstr(mesh_code,"/","_",10)
levelsof mesh_aux if mesh_code!="C01" & mesh_code!="C02" & mesh_code!="C03" & mesh_code!="C11" & mesh_code!="C15" & mesh_code!="C24" & mesh_code!="C26" & mesh_code!="missing", local(code) clean // omitted category: missing/others (missing, C01,C02,C03,C11,C15,C24,C26)
gen mesh_cat=1
local count=1
foreach c of local code {
	local count=`count'+1
	gen mc_`c'=0
 	replace mc_`c'=1 if mesh_aux=="`c'"
 	replace mesh_cat=`count' if mesh_aux=="`c'"
}
drop mesh_aux

preserve
sum matched if INDUSTRY==1
sum matched if INDUSTRY==0
restore


save "${data}${final}data_selection_functions.dta", replace


*------------------------------------*
* Section 2 -- Estimate and Tabulate *
*------------------------------------*

use "${data}${final}data_selection_functions.dta",clear

*-----*
* 2.1 *
*-----*

** preparation

gen SPLIT=top10rev

gen aux=1 if D0==1 & z_modifier!=0
count if aux==1
local N=r(N)
sort aux z
forvalues j=1(1)`N' {
if z_modifier[`j']==-1 {
qui sum z if z<=z[`j'] & z_modifier==0 ,d
qui replace z=r(mean) in `j'
}
if z_modifier[`j']==1 {
qui sum z if z>=z[`j'] & z_modifier==0, d
qui replace z=r(mean) in `j'
}
}
drop aux


sum matched if INDUSTRY==1
local mean_IND=r(mean)
sum matched if INDUSTRY==1 & SPLIT==0
local mean_small=r(mean)
sum matched if INDUSTRY==1 & SPLIT==1
local mean_top10=r(mean)

gen z_Ph2=D0*z



*-----*
* 2.2 *
*-----*

** estimation with joint sample (to get p-value of Wald test for difference between small and large sponsors)

gen large=SPLIT
gen large_z_Ph2=SPLIT*z_Ph2
gen large_D1=SPLIT*D1
gen large_D2=SPLIT*D2
gen sqrt_enrollment_large=sqrt_enrollment*large
gen placebo_large=placebo*large
gen p_adjust_large=p_adjust*large

local covariates z_Ph2 D1 D2 large* sqrt_enrollment* placebo* p_adjust* cy_*##large mc_*##large
local stderr "cl mesh_cat"

logit matched `covariates' if INDUSTRY==1, vce(`stderr')
test large_z_Ph2 large_D1 large_D2 large
local pWald = r(p)


*-----*
* 2.2 *
*-----*

** estimation with split sample => Table 1

local covariates z_Ph2 D1 D2 sqrt_enrollment placebo p_adjust cy_* mc_*
local path "${run}${table}tab1.xls"
local stderr "cl mesh_cat"


logit matched `covariates' if INDUSTRY==1, vce(`stderr')
unique nct_id if INDUSTRY==1
outreg2 using "`path'", replace adds("Mean dep. var.",`mean_IND', "No. of trials" ,r(sum)) ct("All","industry") nocons keep(z_Ph2 D1 D2) addtext(Controls,yes,Completion Year FE,yes,Mesh Condition FE,yes)
logit matched `covariates' if INDUSTRY==1 & SPLIT==0, vce(`stderr')
unique nct_id if INDUSTRY==1 & SPLIT==0
outreg2 using "`path'", append adds("Mean dep. var.",`mean_small', "No. of trials" ,r(sum)) ct("Small","industry") nocons keep(z_Ph2 D1 D2) addtext(Controls,yes,Completion Year FE,yes,Mesh Condition FE,yes)
logit matched `covariates' if INDUSTRY==1 & SPLIT==1, vce(`stderr')
unique nct_id if INDUSTRY==1 & SPLIT==1 
outreg2 using "`path'", append adds("Mean dep. var.",`mean_top10', "No. of trials" ,r(sum), "p-value Wald Test (2) vs. (3)",`pWald') ct("Top 10","industry") nocons keep(z_Ph2 D1 D2) addtext(Controls,yes,Completion Year FE,yes,Mesh Condition FE,yes)


*-----------------------*
* Section 3 -- Figure 3 *
*-----------------------*

qui logit matched `covariates' if INDUSTRY==1, vce(`stderr')
qui margins, predict() at(z_Ph2=(0(0.05)4) D1=0 D2=0 (mean) sqrt_enrollment placebo p_adjust cy_2007 cy_2008 cy_2009 cy_2010 cy_2011 cy_2012 cy_2013 cy_2014 cy_2015 cy_2016 cy_2017 cy_2018 mc_C04 mc_C05 mc_C06 mc_C07 mc_C08_C09 mc_C10 mc_C12_C13 mc_C14 mc_C17 mc_C18 mc_C19 mc_C20 mc_C23 mc_F03)
matrix mat_all = r(table)
matrix mat_grid = r(at)
gen grid = mat_grid[_n,1] in 1/81
gen b_all = mat_all[1,_n] in 1/81
gen ll_all = mat_all[5,_n] in 1/81
gen ul_all = mat_all[6,_n] in 1/81

qui logit matched `covariates' if INDUSTRY==1 & SPLIT==0, vce(`stderr')
qui margins, predict() at(z_Ph2=(0(0.05)4) D1=0 D2=0 (mean) sqrt_enrollment placebo p_adjust cy_2007 cy_2008 cy_2009 cy_2010 cy_2011 cy_2012 cy_2013 cy_2014 cy_2015 cy_2016 cy_2017 cy_2018 mc_C04 mc_C05 mc_C06 mc_C07 mc_C08_C09 mc_C10 mc_C12_C13 mc_C14 mc_C17 mc_C18 mc_C19 mc_C20 mc_C23 mc_F03)
matrix mat_small = r(table)
gen b_small = mat_small[1,_n] in 1/81
gen ll_small = mat_small[5,_n] in 1/81
gen ul_small = mat_small[6,_n] in 1/81

qui logit matched `covariates' if INDUSTRY==1 & SPLIT==1, vce(`stderr')
qui margins, predict() at(z_Ph2=(0(0.05)4) D1=0 D2=0 (mean) sqrt_enrollment placebo p_adjust cy_2007 cy_2008 cy_2009 cy_2010 cy_2011 cy_2012 cy_2013 cy_2014 cy_2015 cy_2016 cy_2017 cy_2018 mc_C04 mc_C05 mc_C06 mc_C07 mc_C08_C09 mc_C10 mc_C12_C13 mc_C14 mc_C17 mc_C18 mc_C19 mc_C20 mc_C23 mc_F03)
matrix mat_large = r(table)
gen b_large = mat_large[1,_n] in 1/81
gen ll_large = mat_large[5,_n] in 1/81
gen ul_large = mat_large[6,_n] in 1/81

graph twoway (line b_all grid, lcolor(mint) lwidth(thick) lpattern(solid)) (line b_small grid, lcolor(eltgreen) lwidth(thick) lpattern(shortdash)) (line b_large grid, lcolor(forest_green) lwidth(thick) lpattern(longdash)) (rarea ll_all ul_all grid, fcolor(mint%20) lwidth(none)) (rarea ll_small ul_small grid, fcolor(eltgreen%20) lwidth(none)) (rarea ll_large ul_large grid, fcolor(Forest_green%20) lwidth(none)), legend(order(1 "All industry" 2 "Small industry" 3 "Top 10 industry") size(medlarge)) ytitle("Predicted continuation probability", size(medlarge)) xtitle("Phase II z-score", size(medlarge)) ylabel(0(0.1)0.6, format(%03.1f)) graphregion(color(white)) bgcolor(white) plotr(ls(none)) name(fig_selection,replace)
graph export "${run}${graph}fig3.png", replace width(1500)

log close 
cd "${run}${dofiles}"
